library(rbiom)
library(kableExtra)
library(ggplot2)
library(ggpubr)
library(ggforce)
library(patchwork)
library(vegan)
library(dplyr)
library(forcats)
The file handles are set in config.R as they’re used by both this script and data_cleaning.
source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/core_functions.R")
source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/config.R")
First do the plots with kruskall wallis comparison between groups to see if there’s an overall difference, then do the pairwise tests because KW says there is a difference.
metadata <- read_metadata(metadata_handle)
metadata <- metadata %>% mutate(Group = if_else(Group == 'Control_HealthySerosurvey', 'Healthy Control', Group))
metadata <- metadata %>% mutate(Group = if_else(Group == 'Acute_Typhi', 'Acute typhoid', Group))
number_per_country <- metadata %>% group_by(Country) %>% summarise(count = n())
number_per_country %>% kbl() %>% kable_styling()
| Country | count |
|---|---|
| Bangladesh | 120 |
| Malawi | 114 |
| Nepal | 76 |
print(sum(number_per_country$count))
## [1] 310
metadata %>% group_by(Group) %>% summarise(count = n()) %>% kbl() %>% kable_styling()
| Group | count |
|---|---|
| Acute typhoid | 103 |
| Carrier | 110 |
| Healthy Control | 97 |
baseline_chars <- get_baseline_characteristics(metadata)
baseline_chars$pct_anti
## [1] 37.50000 0.00000 0.00000 26.47059 0.00000 0.00000 44.82759 0.00000
## [9] 0.00000
baseline_chars$pct_anti %>% na_if(0)
## [1] 37.50000 NA NA 26.47059 NA NA 44.82759 NA
## [9] NA
# BEWARE - na_if(0.0) will convert ALL 0s to NAs, this is ok as they're only in the antibiotics of carriers and controls
# at the moment, but if others get added in, will screw with it.
# baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') %>% na_if(0.0) %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()
# na_if stopped working, so had to do this instead
baseline_chars_table <- baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number')
baseline_chars_table$Carrier <- replace(baseline_chars_table$Carrier,which(baseline_chars_table$Carrier==0),NA)
baseline_chars_table$`Healthy Control` <- replace(baseline_chars_table$`Healthy Control`,which(baseline_chars_table$`Healthy Control`==0),NA)
baseline_chars_table %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()
| Country | variable_name | Acute typhoid | Carrier | Healthy Control |
|---|---|---|---|---|
| Bangladesh | Median age | 6.0 | 37.0 | 28.5 |
| Bangladesh | Women (%) | 47.5 | 47.5 | 65.0 |
| Bangladesh | Antibiotics in last 2 weeks (%) | 37.5 | NA | NA |
| Malawi | Median age | 9.7 | 32.0 | 24.0 |
| Malawi | Women (%) | 64.7 | 57.5 | 65.0 |
| Malawi | Antibiotics in last 2 weeks (%) | 26.5 | NA | NA |
| Nepal | Median age | 17.2 | 43.9 | 35.0 |
| Nepal | Women (%) | 24.1 | 66.7 | 82.4 |
| Nepal | Antibiotics in last 2 weeks (%) | 44.8 | NA | NA |
ggplot(metadata, aes(x = Country, y = Age, fill = Group)) + geom_boxplot() + stat_compare_means(method = 'kruskal.test', label = "p")
comparisons_groups <- list(c("Acute typhoid", "Carrier"), c("Acute typhoid", "Healthy Control"), c("Carrier", "Healthy Control"))
comparisons_countries <- list(c('Bangladesh', 'Malawi'), c('Bangladesh', 'Nepal'), c('Malawi', 'Nepal'))
# default stat method when doing pairwise tests in wilcoxon.
ggboxplot(metadata, facet.by = "Country", y = "Age", x = "Group", color = "Group") + stat_compare_means(comparisons = comparisons_groups) + rremove("x.text") + rremove("xlab") + rremove("x.ticks") # +rotate_x_text(angle = 45)
ggboxplot(metadata, facet.by = "Group", y = "Age", x = "Country", color = "Country") + stat_compare_means(comparisons = comparisons_countries) + rotate_x_text(angle = 45)
country_group_sex <- metadata %>% group_by(Country, Group, Sex) %>% summarise(count = n())
plot_sex <- function(eg1, c){
d <- eg1 %>% filter(Country == c)
p <- ggplot(d, aes(x = Group, y = count, fill = Sex)) +
geom_bar(stat ='identity', position = 'fill') +
ylab('Proportion') +
ggtitle(c)# +
#theme(axis.text=element_text(size=34), axis.title=element_text(size=36,face="bold"), plot.title = element_text(size = 40, face = "bold"), legend.key.size = unit(4, 'cm'), legend.title = element_text(size = 34), legend.text = element_text(size = 28))
return(p)
}
m_sex <- plot_sex(country_group_sex, 'Malawi')
b_sex <- plot_sex(country_group_sex, 'Bangladesh')
n_sex <- plot_sex(country_group_sex, 'Nepal')
m_sex / b_sex / n_sex
strataa_metaphlan_data <- read.csv(file = file.path(metaphlan_input_folder, '2023.05.11.all_strataa_metaphlan.csv'), header= TRUE, sep = ",", row.names = 1, stringsAsFactors = FALSE, check.names=FALSE)
strataa_metaphlan_data$lowest_taxonomic_level <- sapply(str_split(row.names(strataa_metaphlan_data), "\\|"), function(x) x[length(x)])
strataa_metaphlan_metadata <- read.csv(file = file.path(metaphlan_input_folder, '2023.05.11.strataa_metadata.metaphlan.csv'), header = TRUE, sep = ",", row.names = 1, stringsAsFactors = FALSE)
strataa_metaphlan_metadata <- strataa_metaphlan_metadata %>% mutate(SampleID = rownames(strataa_metaphlan_metadata))
strataa_metaphlan_metadata <- strataa_metaphlan_metadata %>% mutate(age_bracket=cut(Age, breaks=c(0, 1, 5, 15, Inf), labels=c("<1", "1-5", "6-15", ">15")))
Alpha diversity - all countries, all groups
all_countries_all_groups_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Carrier'), c('Acute_Typhi', 'Control_HealthySerosurvey'), c('Control_HealthySerosurvey', 'Carrier')))
all_countries_all_groups_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
| rownames(alpha_anova_summary[[1]]) | Df | Sum.Sq | Mean.Sq | F.value | Pr..F. | is_it_significant |
|---|---|---|---|---|---|---|
| Country:Group | 4 | 7.83 | 1.96 | 12.4 | 3.09e-09 | significant |
| Group | 2 | 5.92 | 2.96 | 18.7 | 2.55e-08 | significant |
| Country | 2 | 2.75 | 1.38 | 8.7 | 0.00022 | significant |
| Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 2 | 1.83 | 0.913 | 5.77 | 0.00352 | significant |
| Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 2 | 1.25 | 0.625 | 3.95 | 0.0204 | not_significant |
| Sex:Group | 2 | 1.13 | 0.566 | 3.58 | 0.0292 | not_significant |
| Country:Sex | 2 | 1.11 | 0.553 | 3.49 | 0.0318 | not_significant |
| Country:Sex:Age | 2 | 1.03 | 0.517 | 3.27 | 0.0396 | not_significant |
| Country:Antibiotics_taken_before_sampling_yes_no_assumptions | 2 | 0.835 | 0.417 | 2.64 | 0.0733 | not_significant |
| Country:Age | 2 | 0.576 | 0.288 | 1.82 | 0.164 | not_significant |
| Country:Group:Age | 4 | 1.03 | 0.256 | 1.62 | 0.169 | not_significant |
| Sex:Age | 1 | 0.298 | 0.298 | 1.88 | 0.171 | not_significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 2 | 0.474 | 0.237 | 1.5 | 0.226 | not_significant |
| Country:Sex:Group | 4 | 0.817 | 0.204 | 1.29 | 0.274 | not_significant |
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 2 | 0.318 | 0.159 | 1.01 | 0.367 | not_significant |
| Group:Age | 2 | 0.239 | 0.119 | 0.754 | 0.471 | not_significant |
| Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 1 | 0.0529 | 0.0529 | 0.334 | 0.564 | not_significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 1 | 0.0476 | 0.0476 | 0.301 | 0.584 | not_significant |
| Sex | 1 | 0.0294 | 0.0294 | 0.186 | 0.667 | not_significant |
| Sex:Group:Age | 2 | 0.051 | 0.0255 | 0.161 | 0.851 | not_significant |
| Country:Sex:Group:Age | 4 | 0.127 | 0.0317 | 0.201 | 0.938 | not_significant |
| Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 1 | 0.000136 | 0.000136 | 0.000861 | 0.977 | not_significant |
| Age | 1 | 1.07e-08 | 1.07e-08 | 6.75e-08 | 1 | not_significant |
| Residuals | 261 | 41.3 | 0.158 | NA | NA | NA |
all_countries_all_groups_alpha$alpha_plot_group
all_countries_all_groups_alpha$alpha_plot_antibiotics
Alpha diversity - all countries, healthy and acute
all_countries_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))
all_countries_healthy_acute_alpha$alpha_by_country
all_countries_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
| rownames(alpha_anova_summary[[1]]) | Df | Sum.Sq | Mean.Sq | F.value | Pr..F. | is_it_significant |
|---|---|---|---|---|---|---|
| Country | 2 | 5.74 | 2.87 | 17.4 | 1.36e-07 | significant |
| Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 2 | 1.68 | 0.842 | 5.12 | 0.00699 | significant |
| Sex:Age | 1 | 1.07 | 1.07 | 6.5 | 0.0117 | not_significant |
| Country:Sex | 2 | 1.5 | 0.75 | 4.56 | 0.0119 | not_significant |
| Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 2 | 1.39 | 0.693 | 4.21 | 0.0165 | not_significant |
| Country:Age | 2 | 1.02 | 0.51 | 3.1 | 0.0477 | not_significant |
| Sex:Group | 1 | 0.628 | 0.628 | 3.82 | 0.0525 | not_significant |
| Country:Antibiotics_taken_before_sampling_yes_no_assumptions | 2 | 0.816 | 0.408 | 2.48 | 0.0871 | not_significant |
| Country:Group | 2 | 0.698 | 0.349 | 2.12 | 0.123 | not_significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 2 | 0.618 | 0.309 | 1.88 | 0.157 | not_significant |
| Group | 1 | 0.164 | 0.164 | 0.997 | 0.319 | not_significant |
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 2 | 0.359 | 0.179 | 1.09 | 0.339 | not_significant |
| Sex | 1 | 0.0592 | 0.0592 | 0.36 | 0.55 | not_significant |
| Country:Sex:Age | 2 | 0.188 | 0.0938 | 0.57 | 0.567 | not_significant |
| Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 1 | 0.0511 | 0.0511 | 0.31 | 0.578 | not_significant |
| Sex:Group:Age | 1 | 0.0368 | 0.0368 | 0.224 | 0.637 | not_significant |
| Group:Age | 1 | 0.0147 | 0.0147 | 0.0891 | 0.766 | not_significant |
| Country:Group:Age | 2 | 0.0713 | 0.0357 | 0.217 | 0.805 | not_significant |
| Age | 1 | 0.00772 | 0.00772 | 0.0469 | 0.829 | not_significant |
| Country:Sex:Group | 2 | 0.0595 | 0.0297 | 0.181 | 0.835 | not_significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 1 | 0.00675 | 0.00675 | 0.041 | 0.84 | not_significant |
| Country:Sex:Group:Age | 2 | 0.05 | 0.025 | 0.152 | 0.859 | not_significant |
| Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 1 | 0.000136 | 0.000136 | 0.000828 | 0.977 | not_significant |
| Residuals | 163 | 26.8 | 0.165 | NA | NA | NA |
all_countries_healthy_acute_alpha$alpha_plot_group
all_countries_healthy_acute_alpha$alpha_plot_antibiotics
alpha diversity, malawi only
malawi_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Malawi'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))
malawi_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
| rownames(alpha_anova_summary[[1]]) | Df | Sum.Sq | Mean.Sq | F.value | Pr..F. | is_it_significant |
|---|---|---|---|---|---|---|
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 1 | 2.27 | 2.27 | 16 | 0.000167 | significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 1 | 1.31 | 1.31 | 9.27 | 0.0034 | significant |
| Sex | 1 | 1.16 | 1.16 | 8.21 | 0.00567 | significant |
| Group | 1 | 0.84 | 0.84 | 5.93 | 0.0177 | not_significant |
| Age | 1 | 0.451 | 0.451 | 3.19 | 0.079 | not_significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 1 | 0.406 | 0.406 | 2.87 | 0.0952 | not_significant |
| Sex:Group | 1 | 0.313 | 0.313 | 2.21 | 0.142 | not_significant |
| Sex:Group:Age | 1 | 0.0978 | 0.0978 | 0.691 | 0.409 | not_significant |
| Sex:Age | 1 | 0.0164 | 0.0164 | 0.116 | 0.734 | not_significant |
| Group:Age | 1 | 0.00122 | 0.00122 | 0.00864 | 0.926 | not_significant |
| Residuals | 63 | 8.91 | 0.141 | NA | NA | NA |
malawi_healthy_acute_alpha$alpha_plot_group
malawi_healthy_acute_alpha$alpha_plot_antibiotics
All groups.
all_countries_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
all_countries_beta_all_groups$pn_res %>% kbl %>% kable_styling()
| R2 | Pr(>F) | is_it_significant | |
|---|---|---|---|
| Group | 0.0759082 | 0.0009990 | significant |
| Group:Age | 0.0121511 | 0.0059940 | significant |
| Sex:Group:Age | 0.0081783 | 0.1028971 | not_significant |
| Sex:Group | 0.0079033 | 0.1058941 | not_significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0076947 | 0.1358641 | not_significant |
| Age | 0.0038070 | 0.2057942 | not_significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0035541 | 0.2257742 | not_significant |
| Sex:Age | 0.0035425 | 0.2407592 | not_significant |
| Sex | 0.0035891 | 0.2417582 | not_significant |
| Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0027831 | 0.4825175 | not_significant |
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0045854 | 0.7812188 | not_significant |
| Total | 1.0000000 | NA | NA |
| Residual | 0.8663032 | NA | NA |
bgd_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
bgd_beta_all_groups$pn_res %>% kbl %>% kable_styling()
| R2 | Pr(>F) | is_it_significant | |
|---|---|---|---|
| Group | 0.2679584 | 0.0009990 | significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0124851 | 0.0369630 | not_significant |
| Sex:Age | 0.0093987 | 0.1308691 | not_significant |
| Age | 0.0084979 | 0.1678322 | not_significant |
| Sex:Group | 0.0155821 | 0.2037962 | not_significant |
| Sex | 0.0078067 | 0.2157842 | not_significant |
| Group:Age | 0.0115137 | 0.5164835 | not_significant |
| Sex:Group:Age | 0.0104905 | 0.6363636 | not_significant |
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0047215 | 0.6893107 | not_significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0044103 | 0.6933067 | not_significant |
| Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0035306 | 0.8511489 | not_significant |
| Total | 1.0000000 | NA | NA |
| Residual | 0.6436043 | NA | NA |
# bgd_beta_all_groups$pc12
# bgd_beta_all_groups$pc34
mal_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
mal_beta_all_groups$pn_res %>% kbl %>% kable_styling()
| R2 | Pr(>F) | is_it_significant | |
|---|---|---|---|
| Group | 0.1942771 | 0.0009990 | significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0247209 | 0.0009990 | significant |
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0225358 | 0.0009990 | significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0169857 | 0.0069930 | significant |
| Sex:Age | 0.0138711 | 0.0149850 | not_significant |
| Sex:Group | 0.0217097 | 0.0229770 | not_significant |
| Group:Age | 0.0223640 | 0.0289710 | not_significant |
| Age | 0.0114622 | 0.0459540 | not_significant |
| Sex | 0.0075054 | 0.2767233 | not_significant |
| Sex:Group:Age | 0.0129653 | 0.4505495 | not_significant |
| Total | 1.0000000 | NA | NA |
| Residual | 0.6516027 | NA | NA |
# mal_beta_all_groups$pc12
# mal_beta_all_groups$pc34
nep_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
nep_beta_all_groups$pn_res %>% kbl %>% kable_styling()
| R2 | Pr(>F) | is_it_significant | |
|---|---|---|---|
| Group | 0.0898195 | 0.0009990 | significant |
| Sex | 0.0176933 | 0.1438561 | not_significant |
| Sex:Age | 0.0148693 | 0.2607393 | not_significant |
| Age | 0.0125478 | 0.4615385 | not_significant |
| Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0124906 | 0.4785215 | not_significant |
| Sex:Group | 0.0239383 | 0.5744256 | not_significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0228450 | 0.6333666 | not_significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0106433 | 0.6473526 | not_significant |
| Sex:Group:Age | 0.0169339 | 0.9190809 | not_significant |
| Group:Age | 0.0162360 | 0.9620380 | not_significant |
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0102948 | 0.9980020 | not_significant |
| Total | 1.0000000 | NA | NA |
| Residual | 0.7516884 | NA | NA |
all_countries_beta_all_groups$pc12 / bgd_beta_all_groups$pc12 / nep_beta_all_groups$pc12
Acute vs healthy.
all_countries_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
all_countries_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
| R2 | Pr(>F) | is_it_significant | |
|---|---|---|---|
| Group | 0.0285252 | 0.0009990 | significant |
| Group:Age | 0.0117687 | 0.0149850 | not_significant |
| Age | 0.0092014 | 0.0369630 | not_significant |
| Sex:Group:Age | 0.0072077 | 0.1048951 | not_significant |
| Sex | 0.0070879 | 0.1108891 | not_significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0117438 | 0.2027972 | not_significant |
| Sex:Age | 0.0058394 | 0.2297702 | not_significant |
| Sex:Group | 0.0057923 | 0.2567433 | not_significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0055130 | 0.2977023 | not_significant |
| Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0043096 | 0.4825175 | not_significant |
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0070480 | 0.8291708 | not_significant |
| Total | 1.0000000 | NA | NA |
| Residual | 0.8959630 | NA | NA |
bgd_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
bgd_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
| R2 | Pr(>F) | is_it_significant | |
|---|---|---|---|
| Group | 0.0782718 | 0.0009990 | significant |
| Sex | 0.0230592 | 0.0399600 | not_significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0199564 | 0.0799201 | not_significant |
| Sex:Age | 0.0168558 | 0.1288711 | not_significant |
| Age | 0.0146689 | 0.2217782 | not_significant |
| Group:Age | 0.0109377 | 0.5044955 | not_significant |
| Sex:Group:Age | 0.0100500 | 0.5704296 | not_significant |
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0080933 | 0.7512488 | not_significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0074906 | 0.8341658 | not_significant |
| Sex:Group | 0.0067483 | 0.8841159 | not_significant |
| Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0059228 | 0.9240759 | not_significant |
| Total | 1.0000000 | NA | NA |
| Residual | 0.7979452 | NA | NA |
mal_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
mal_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
| R2 | Pr(>F) | is_it_significant | |
|---|---|---|---|
| Group | 0.1995900 | 0.0009990 | significant |
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0371210 | 0.0009990 | significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0381931 | 0.0029970 | significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0267524 | 0.0069930 | significant |
| Sex:Group | 0.0200496 | 0.0269730 | not_significant |
| Age | 0.0194908 | 0.0309690 | not_significant |
| Sex:Age | 0.0144797 | 0.1348651 | not_significant |
| Sex | 0.0141068 | 0.1448551 | not_significant |
| Sex:Group:Age | 0.0091754 | 0.4585415 | not_significant |
| Group:Age | 0.0078947 | 0.5954046 | not_significant |
| Total | 1.0000000 | NA | NA |
| Residual | 0.6131466 | NA | NA |
nep_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
nep_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
| R2 | Pr(>F) | is_it_significant | |
|---|---|---|---|
| Group | 0.0598990 | 0.0039960 | significant |
| Sex | 0.0484279 | 0.0089910 | significant |
| Sex:Group | 0.0246755 | 0.2917083 | not_significant |
| Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0244159 | 0.3356643 | not_significant |
| Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0450942 | 0.4025974 | not_significant |
| Sex:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0207951 | 0.5044955 | not_significant |
| Sex:Group:Age | 0.0175717 | 0.6853147 | not_significant |
| Age | 0.0154038 | 0.8171828 | not_significant |
| Sex:Age | 0.0148604 | 0.8351648 | not_significant |
| Group:Age | 0.0102689 | 0.9800200 | not_significant |
| Age:Antibiotics_taken_before_sampling_yes_no_assumptions | 0.0188359 | 0.9960040 | not_significant |
| Total | 1.0000000 | NA | NA |
| Residual | 0.6997518 | NA | NA |
all_countries_beta_acute_healthy$pc12 + bgd_beta_acute_healthy$pc12 + mal_beta_acute_healthy$pc12 + nep_beta_acute_healthy$pc12
# get the taxa that are phyla, not classes, or below (species etc), and tidy the data.
phyla <- strataa_metaphlan_data %>% mutate(clade_name = rownames(strataa_metaphlan_data)) %>% filter(grepl("p__", clade_name)) %>% filter(!grepl("c__", clade_name)) %>% pivot_longer(!c(clade_name, lowest_taxonomic_level), names_to = "sample", values_to = "relative_abundance")
# relative_abundance > 1 returns a list of TRUE/FALSE values, which is then summed to get the number of samples in which the phylum is present at > 1% relative abundance.
# then we filter to only keep phyla that are present at 1% in at least 10% of samples.
phyla_to_exclude <- phyla %>% group_by(clade_name) %>%
summarise(count = sum(relative_abundance > 1)) %>%
filter(count < (length(unique(phyla$sample)) / 10)) %>%
pull(clade_name)
View(phyla_to_exclude)
# in order to make each sample add up to 100, we need to add the excluded taxa back in as a single "rare taxa" phylum.
# first we need to calculate the relative abundance of the excluded taxa in each sample.
excluded_phyla <- phyla %>%
filter(clade_name %in% phyla_to_exclude) %>% group_by(sample) %>% summarise(relative_abundance = sum(relative_abundance))
# then make a column with the name "rare_taxa" for each sample, annd bind it to the excluded taxa data.
rare_taxa_column <- data.frame(lowest_taxonomic_level = c(rep("rare_taxa", nrow(excluded_phyla))), clade_name = c(rep("rare_taxa", nrow(excluded_phyla))))
excluded_phyla <- cbind(rare_taxa_column, excluded_phyla)
# then remove the excluded taxa from the phyla data.
phyla_clean <- phyla %>%
filter(!(clade_name %in% phyla_to_exclude))
# and add the excluded taxa back in.
phyla_clean <- rbind(phyla_clean, excluded_phyla)
View(phyla_clean)
View(excluded_phyla)
colnames(strataa_metaphlan_metadata)
## [1] "Group"
## [2] "Sex"
## [3] "Country"
## [4] "Age"
## [5] "Antibiotics_taken_before_sampling_yes_no_assumptions"
## [6] "sequencing_lane"
## [7] "SampleID"
## [8] "age_bracket"
metadata_select <- strataa_metaphlan_metadata %>% dplyr::select(SampleID, Group, Country)
phyla_clean_metadata <- phyla_clean %>% left_join(metadata_select, by = c("sample" = "SampleID"))
phyla_clean_metadata <- phyla_clean_metadata %>% mutate(Group = ifelse(Group == "Acute_Typhi", "Typhi", Group)) %>% mutate(Group = ifelse(Group == "Control_HealthySerosurvey", "Healthy", Group))
order_of_groups <- c("Typhi", "Healthy")
plot_per_country_abundance <- function(phyla_clean_metadata, country, group_order){
# thanks chatgpt!
phyla_clean_country <- phyla_clean_metadata %>% filter(Country == country) %>% filter(Group %in% group_order)
phyla_clean_country_fct <- phyla_clean_country %>%
mutate(Group = factor(Group, levels = group_order), # Convert group to a factor with the desired order
group_order_numeric = as.numeric(Group), # Create a new numeric variable based on the order of group
sample = fct_reorder(sample, group_order_numeric)) # Reorder sample based on group_order_numeric
p <- ggplot(data = phyla_clean_country_fct, aes(x = sample, y = relative_abundance, fill = clade_name)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank()) +
scale_x_discrete(breaks=phyla_clean_country_fct$sample, labels=phyla_clean_country_fct$Group) +
ggtitle(country) +
ylim(0, 100.01)
p
}
bangladesh_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Bangladesh", group_order = order_of_groups)
# bangladesh_phyla_plot
malawi_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Malawi", group_order = order_of_groups)
nepal_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Nepal", group_order = order_of_groups)
bangladesh_phyla_plot / malawi_phyla_plot / nepal_phyla_plot
# going back to phyla so that get all the weird rare ones.
# need to join to meta-data
# remove carriers
# group by clade, group, country
# summarise to get the median.
# pivot wider to get the table.
# dplyr::select(order(colnames(.))) rearranges the columns in alphabetical order
# relocate moves the clade_name column to the first column.
phyla %>% left_join(metadata_select, by = c("sample" = "SampleID")) %>% filter(Group != 'Carrier') %>% group_by(clade_name, Group, Country) %>% summarise(median_prevalence = median(relative_abundance)) %>% pivot_wider(names_from = c('Country', 'Group'), values_from = 'median_prevalence') %>% dplyr::select(order(colnames(.))) %>% relocate(clade_name, .before = 1) %>% kbl() %>% kable_styling()
| clade_name | Bangladesh_Acute_Typhi | Bangladesh_Control_HealthySerosurvey | Malawi_Acute_Typhi | Malawi_Control_HealthySerosurvey | Nepal_Acute_Typhi | Nepal_Control_HealthySerosurvey |
|---|---|---|---|---|---|---|
| k__Archaea|p__Candidatus_Thermoplasmatota | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Archaea|p__Euryarchaeota | 0.000000 | 0.000000 | 0.249010 | 0.000000 | 0.00000 | 0.22474 |
| k__Archaea|p__Thaumarchaeota | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Bacteria|p__Acidobacteria | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Bacteria|p__Actinobacteria | 14.361185 | 13.136440 | 1.893725 | 0.524965 | 3.33634 | 3.81550 |
| k__Bacteria|p__Bacteria_unclassified | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Bacteria|p__Bacteroidetes | 0.911970 | 2.216730 | 15.361165 | 34.630185 | 28.24679 | 20.53695 |
| k__Bacteria|p__Candidatus_Melainabacteria | 0.000000 | 0.000000 | 0.038085 | 0.052210 | 0.00000 | 0.02157 |
| k__Bacteria|p__Candidatus_Saccharibacteria | 0.017310 | 0.015625 | 0.000000 | 0.000000 | 0.00000 | 0.00203 |
| k__Bacteria|p__Chloroflexi | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Bacteria|p__Elusimicrobia | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Bacteria|p__Firmicutes | 80.670075 | 72.161565 | 60.161350 | 58.311770 | 53.74090 | 60.17250 |
| k__Bacteria|p__Fusobacteria | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Bacteria|p__Lentisphaerae | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Bacteria|p__Proteobacteria | 0.629715 | 1.272295 | 6.606440 | 6.195335 | 2.90174 | 2.52509 |
| k__Bacteria|p__Spirochaetes | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Bacteria|p__Synergistetes | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Bacteria|p__Tenericutes | 0.000000 | 0.000000 | 0.002715 | 0.000000 | 0.01965 | 0.01362 |
| k__Bacteria|p__Verrucomicrobia | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Eukaryota|p__Ascomycota | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
| k__Eukaryota|p__Eukaryota_unclassified | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 |
Maaslin basics
groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')
bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
bangladesh_taxonomic_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis)
malawi_taxonomic_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis)
nepal_taxonomic_maaslin <- read_in_maaslin('Nepal', groups_to_analyse, nep_variables_for_analysis)
bangladesh_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(bangladesh_taxonomic_maaslin)
malawi_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(malawi_taxonomic_maaslin)
nepal_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(nepal_taxonomic_maaslin)
bangladesh_maaslin_stats <- basic_maaslin_stats(bangladesh_taxonomic_maaslin_filtered, 'Bangladesh', bang_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats <- basic_maaslin_stats(malawi_taxonomic_maaslin_filtered, 'Malawi', mwi_variables_for_analysis, groups_to_analyse)
nepal_maaslin_stats <- basic_maaslin_stats(nepal_taxonomic_maaslin_filtered, 'Nepal', nep_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats$volcano_plot / bangladesh_maaslin_stats$volcano_plot / nepal_maaslin_stats$volcano_plot
nrow(malawi_maaslin_stats$maaslin_results_sig)
## [1] 296
nrow(bangladesh_maaslin_stats$maaslin_results_sig)
## [1] 23
nrow(nepal_maaslin_stats$maaslin_results_sig)
## [1] 0
There were 296 species significantly (q-val < 0.05) associated with health/disease in Malawi, in Bangladesh, and in Nepal.
Combine the taxonomic maaslins, and print out the species that are sig in both and share directions.
Because sequencing run and participant type are totally confounded for Bangladesh, need to exclude sequencing run from the final model for Bangladesh (otherwise, wipes out the signals).
between acute <-> health.
###Â associated at both sites
combined_results <- run_combine_maaslins(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
combined_results$positive_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>%
select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
kbl() %>% kable_styling()
| feature | Species | Coefficient Bangladesh | Standard Error Bangladesh | Q-value Bangladesh | Coefficient Malawi | Standard Error Malawi | Q-value Malawi |
|---|---|---|---|---|---|---|---|
| k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Clostridiaceae.g__Clostridium.s__Clostridium_SGB6179 | s__Clostridium_SGB6179 | 8.79 | 1.34 | 4.25e-05 | 3.11 | 0.936 | 0.0286 |
| k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Prevotellaceae.g__Prevotella.s__Prevotella_copri_clade_A | s__Prevotella_copri_clade_A | 4.54 | 0.978 | 0.00971 | 7.64 | 1.25 | 1.21e-05 |
| k__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.f__Veillonellaceae.g__GGB4266.s__GGB4266_SGB5809 | s__GGB4266_SGB5809 | 4.58 | 1.09 | 0.0147 | 6.94 | 0.981 | 4.38e-07 |
| k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Pasteurellales.f__Pasteurellaceae.g__Haemophilus.s__Haemophilus_parainfluenzae | s__Haemophilus_parainfluenzae | 3.64 | 0.979 | 0.03 | 6.92 | 0.953 | 2.26e-07 |
| k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Peptostreptococcaceae.g__Romboutsia.s__Romboutsia_timonensis | s__Romboutsia_timonensis | 4.52 | 1.25 | 0.0386 | 5.07 | 0.903 | 5.91e-05 |
combined_results$negative_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>%
select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
kbl() %>% kable_styling()
| feature | Species | Coefficient Bangladesh | Standard Error Bangladesh | Q-value Bangladesh | Coefficient Malawi | Standard Error Malawi | Q-value Malawi |
|---|---|---|---|---|---|---|---|
| k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Lachnospiraceae_unclassified.s__Lachnospiraceae_bacterium | s__Lachnospiraceae_bacterium | -3.05 | 0.838 | 0.0366 | -2.87 | 0.785 | 0.0135 |
nrow(combined_results$mwi_maaslin_only)
## [1] 290
nrow(combined_results$bang_maaslin_only)
## [1] 17
There were 290 species significantly (q-val < 0.05) associated with health/disease in Malawi only and 17 in Bangladesh only.
###Â associated at only one site
The ones associated at only one site are written out to a file, you can look at them manually there.
strataa_metaphlan_data_longer <- strataa_metaphlan_data %>% mutate(feature = rownames(strataa_metaphlan_data)) %>% pivot_longer(!c(feature, lowest_taxonomic_level), names_to = "SampleID", values_to = "prevalence")
strataa_metaphlan_data_longer_meta <- strataa_metaphlan_data_longer %>% left_join(strataa_metaphlan_metadata, by = c("SampleID" = "SampleID"))
pc <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Prevotella_copri_clade_A')
cs <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Clostridium_SGB6179')
SGB5809 <-run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__GGB4266_SGB5809')
hp <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Haemophilus_parainfluenzae')
rt <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Romboutsia_timonensis')
lb <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Lachnospiraceae_bacterium')
# pc + cs + SGB5809 + hp + rt + lb
Print the results for all the P. copri clades
all_taxa_maaslin <- combined_maaslins$all_features
p_copri_maaslin <- all_taxa_maaslin %>% filter(grepl('_Prevotella_copri_', feature)) %>% filter(qval_bang < 0.05 | qval_mal < 0.05)
write_csv(p_copri_maaslin, file.path(maaslin_taxonomic_output_root_folder, 'combined_maaslins_p_copri.csv'))
Functional groups associated with health
# This needs to be replaced by combine_maaslins
variables_for_analysis <- c("Group", "Gender", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
vars_for_dirname <- paste(variables_for_analysis, collapse = '.')
groups_to_analyse <- c('Acute_Typhi', 'Control_Healthy')
bang_maaslin_output_dir <- file.path(maaslin_functional_output_root_folder, paste('Bangladesh', paste(groups_to_analyse, collapse = '_vs_'), vars_for_dirname, sep = '_'))
malawi_maaslin_output_dir <- file.path(maaslin_functional_output_root_folder, paste('Malawi', paste(groups_to_analyse, collapse = '_vs_'), vars_for_dirname, sep = '_'))
bang_maaslin <- read_delim(file.path(bang_maaslin_output_dir, "all_results.tsv"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)
malawi_maaslin <- read_delim(file.path(malawi_maaslin_output_dir, "all_results.tsv"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)
combined_maaslins <- combine_maaslins(bang_maaslin, malawi_maaslin)
combined_maaslins_positive_coef <- combined_maaslins$positive_coef
combined_maaslins_negative_coef <- combined_maaslins$negative_coef
# hacky as hell thing to split the feature name to extract the organism and the enzyme class.
# because separate isn't working with .. or \\.\\. which is the actual separator used.
combined_maaslins_positive_coef <- combined_maaslins_positive_coef %>% separate_wider_delim(feature, delim = 'Entryname.', names_sep = '', cols_remove = FALSE) %>% separate_wider_delim(feature2, delim = '..OS.', names_sep = '') %>% separate_wider_delim(feature22, delim = '..SMASH', names_sep = '') %>% select(!c(feature1, feature222)) %>% rename(MGC_class = feature21, Species = feature221, feature = featurefeature)
combined_maaslins_negative_coef <- combined_maaslins_negative_coef %>% separate_wider_delim(feature, delim = 'Entryname.', names_sep = '', cols_remove = FALSE) %>% separate_wider_delim(feature2, delim = '..OS.', names_sep = '') %>% separate_wider_delim(feature22, delim = '..SMASH', names_sep = '') %>% select(!c(feature1, feature222)) %>% rename(MGC_class = feature21, Species = feature221, feature = featurefeature)
combined_maaslins_positive_coef %>% kbl() %>% kable_styling()
combined_maaslins_negative_coef %>% kbl() %>% kable_styling()
write_csv(combined_maaslins_positive_coef, file.path(maaslin_functional_output_root_folder, 'combined_maaslins_positive_coef.csv'))
combined_maaslins_negative_coef %>% kbl() %>% kable_styling()
write_csv(combined_maaslins_negative_coef, file.path(maaslin_functional_output_root_folder, 'combined_maaslins_negative_coef.csv'))
groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey')
mwi_variables_for_analysis <- c("Group", "Sex", "Age")
bang_variables_for_analysis <- c("Group", "Sex", "Age")
run_combine_maaslins(groups_to_analyse, mwi_variables_for_analysis, bang_variables_for_analysis, maaslin_functional_output_root_folder, 'bigmap')